0.1 Packages and settings preparation

0.1.1 set directory

setwd("~/Google Drive/Vibrant/ALS Project - Chao Gao/Paper outline/Code/V1.6")

0.1.2 Request more heap memory

options( java.parameters = "-Xmx12g" )

0.1.3 Check Current Java Version

Sys.getenv("JAVA_HOME")

0.1.4 Install packages

install.packages(c("dplyr","plyr","mi","reshape","reshape2","ggplot2","MASS","magrittr","psych","bartMachine","SuperLearner", "caret","e1071","rJava"))

0.1.5 Load packages

library(dplyr)
library(plyr)
library(mi)
library(reshape)
library(reshape2)
library(ggplot2)
library(MASS)
library(magrittr)
library(psych) 
library(bartMachine)
library(SuperLearner)
library(caret)
library(e1071)

0.1.6 Personalized functions

##Replace missing with the median of the column
fun<-function(x){
  x[is.na(x)] =median(x, na.rm=TRUE) #convert the item with NA to median value from the column
  x #display the column
}

0.2 Data preprocessing

The flowchart shows the steps of data preprocessing and analysis ### Load dataset #### Load raw PRO-ACT training dataset (3103974 obs. of 6 variables) ####

df <- read.table("Data/all_forms_PROACT.txt", header=TRUE, sep="|", na.strings=c("NA","","NaN"), fill = TRUE, quote = "", stringsAsFactors = FALSE, comment.char="")
length(unique(df$feature_name))#6318
## [1] 6318
length(unique(df$SubjectID))#8635
## [1] 8635

0.2.0.1 Load PRO-ACT training ALSFRS_slope dataset (2424 obs. of 2 variables)

df_slope <- read.table("Data/ALSFRS_slope_PROACT.txt", header=TRUE, sep="|", na.strings=c("NA","","NaN"), fill = TRUE, quote = "", stringsAsFactors = FALSE, comment.char="")
#remove study subjects with NA in ALSFRS_slope
df_slope <- df_slope[!is.na(df_slope$ALSFRS_slope), ]

0.2.0.2 Load raw PRO-ACT testing dataset (133724 obs. of 6 variables)

df_v <- read.table("Data/all_forms_validate_spike.txt", header=TRUE, sep="|", na.strings=c("NA","","NaN"), fill = TRUE, quote = "", stringsAsFactors = FALSE, comment.char="")
length(unique(df_v$SubjectID))#200
## [1] 200
length(unique(df_v$feature_name))#768
## [1] 768

0.2.0.3 Load PRO-ACT testing ALSFRS_slope dataset (101 obs. of 2 variables)

df_v_slope <- read.table("Data/ALSFRS_slope_validate_spike.txt",header=TRUE,sep="|", na.strings=c("NA","","NaN"), fill = TRUE, quote = "", stringsAsFactors = FALSE, comment.char="")

0.2.1 Analysis of missingness

0.2.1.1 G enerate tile plot to present missing pattern of raw PRO-ACT training dataset (Figure 2)

This figure shows the missingness of orgininal training dataset

This figure shows the missingness of orgininal training dataset

0.2.2 Deal with longitudinal data

0.2.2.1 Preprocessing raw training data(2424 obs. of 172 variables)

##read data
df0 <- df

#remove Adverse Event
df1 <- df0[df0$form_name != "Adverse Event", ] 

# select features
df1 <- df1[df1$feature_name %in% c("Age","Gender", "onset_delta" ,"onset_site","ALSFRS_Total","mouth","hands", "trunk","leg","respiratory", "fvc_percent","bp_diastolic","bp_systolic", "BMI", "pulse", "if_use_Riluzole", "Urine Ph","Albumin", "Protein", "Sodium", "Potassium","Bicarbonate", "Chloride", "Blood Urea Nitrogen (BUN)", "Creatinine","Alkaline Phosphatase","ALT(SGPT)", "Gamma-glutamyltransferase", "AST(SGOT)", "Bilirubin (total)", "White Blood Cell (WBC)", "Neutrophils","Lymphocytes", "Monocytes", "Eosinophils","Basophils","Red Blood Cells (RBC)","Hemoglobin", "Hematocrit","Platelets","CK","Triglycerides", "Total Cholesterol", "Glucose","HbA1c (Glycated Hemoglobin)", "Calcium", "Phosphorus", "Race"),]
### Organized the data
# change "feature delta" as numeric feature
df1$feature_delta<-as.numeric(df1$feature_delta)

#Organized the data format 
colnames(df1)[1]<-"SubjectID"
#Read slope values 
slope <- df_slope
dff<-merge(slope,df1,by="SubjectID",all.x=TRUE)
feature_lst <- unique(dff$feature_name)
list_length<-length(feature_lst) 
lengthh<-c(1:list_length)

IID<-order(dff$feature_delta)
dff<-dff[IID,]

#deal with constant numeric features
constant_id<-c(30,37,41,45)
for (i in constant_id){ 
  aaa<-dff[which(dff$feature_name==feature_lst[i]),]
  colnames(aaa)[1]<-"SubjectID"
  ID_lst <- unique(aaa$ID)
  aaa$feature_value<-as.numeric(as.factor(aaa$feature_value))
  cof<-ddply(aaa,~SubjectID,summarise,mean=mean(feature_value) )
  colnames(cof)[2]<-paste(feature_lst[i],"mean",sep="_") 
  slope<-merge(slope,cof,by="SubjectID",all=TRUE)
} 

# deal with constant categorical feature
for (i in c(32,11)){ 
  aaa<-dff[which(dff$feature_name==feature_lst[i]),]
  colnames(aaa)[1]<-"SubjectID"
  ID_lst <- unique(aaa$ID)
  aaa$feature_value<-as.numeric(aaa$feature_value)
  cof<-ddply(aaa,~SubjectID,summarise,mean=mean(feature_value) )
  colnames(cof)[2]<-paste(feature_lst[i],"mean",sep="_") 
  slope<-merge(slope,cof,by="SubjectID",all=TRUE)
} 


# deal with time-varying features
for (i in lengthh[-c(30,32,37,41,45,11)]){
  aaa<-dff[which(dff$feature_name==feature_lst[i]),]
   aaa<-aaa[,-6]
  colnames(aaa)[1]<-"SubjectID"
  ID_lst <- unique(aaa$ID)
  aaa$feature_value<-as.numeric(aaa$feature_value)
  cof<-ddply(aaa,~SubjectID,summarise,max=max(feature_value),min=min(feature_value),
             median=median(feature_value)
  )
  aaa<-na.omit(aaa)
  mods = dlply(aaa, .(SubjectID), lm, formula = feature_value ~ feature_delta)
  coefs = ldply(mods, coef)
  cof<-merge(cof,coefs[,c(1,3)],by="SubjectID",all.x=TRUE)
  colnames(cof)[2:5]<-paste(feature_lst[i],c("max","min","median","slope"),sep="_")
  slope<-merge(slope,cof,by="SubjectID",all=TRUE)
}
# write dataset into local directory 
write.csv(slope,"slope_raw_training_new.csv", row.names = FALSE)
#172 covariates(including SubjectID)

0.2.2.2 Preprocessing raw testing dataset (101 obs. of 172 variables)

df0_v <- df_v

#remove Adverse Event
df1_v <- df0_v[df0_v$form_name != "Adverse Event", ] 

df1_v <- df1_v[df1_v$feature_name %in% c("Age","Gender", "onset_delta" ,"onset_site","ALSFRS_Total","mouth","hands","trunk","leg","respiratory", "fvc_percent","bp_diastolic","bp_systolic", "BMI",
"pulse", "if_use_Riluzole", "Urine Ph","Albumin", "Protein", "Sodium", "Potassium","Bicarbonate", "Chloride","Blood Urea Nitrogen (BUN)", "Creatinine","Alkaline Phosphatase","ALT(SGPT)", "Gamma-glutamyltransferase", "AST(SGOT)","Bilirubin (total)", "White Blood Cell (WBC)", "Neutrophils","Lymphocytes", "Monocytes","Eosinophils","Basophils","Red Blood Cells (RBC)","Hemoglobin", "Hematocrit","Platelets","CK","Triglycerides","Total Cholesterol", "Glucose","HbA1c (Glycated Hemoglobin)", "Calcium", "Phosphorus", "Race"),]

df1<-df1_v
df1$feature_delta<-as.numeric(df1$feature_delta)

colnames(df1)[1]<-"SubjectID"
slope<- df_v_slope

dff<-merge(slope,df1,by="SubjectID",all.x=TRUE)
feature_lst <- unique(dff$feature_name)
list_length<-length(feature_lst) 
lengthh<-c(1:list_length)

IID<-order(dff$feature_delta)
dff<-dff[IID,]

## deal with constant numeric features
constant_id<-c(7,12,13,4)
for (i in constant_id){ 
  aaa<-dff[which(dff$feature_name==feature_lst[i]),]
  colnames(aaa)[1]<-"SubjectID"
  ID_lst <- unique(aaa$ID)
  aaa$feature_value<-as.numeric(as.factor(aaa$feature_value))
  cof<-ddply(aaa,~SubjectID,summarise,mean=mean(feature_value) )
  colnames(cof)[2]<-paste(feature_lst[i],"mean",sep="_") 
  slope<-merge(slope,cof,by="SubjectID",all=TRUE)
} 

#deal with constant categorical features
for (i in c(8,11)){ 
  aaa<-dff[which(dff$feature_name==feature_lst[i]),]
  colnames(aaa)[1]<-"SubjectID"
  ID_lst <- unique(aaa$ID)
  aaa$feature_value<-as.numeric(aaa$feature_value)
  cof<-ddply(aaa,~SubjectID,summarise,mean=mean(feature_value) )
  colnames(cof)[2]<-paste(feature_lst[i],"mean",sep="_") 
  slope<-merge(slope,cof,by="SubjectID",all=TRUE)
} 


#deal with time-varying features
for (i in lengthh[-c(7,12,13,4,8,11)])  {
  aaa<-dff[which(dff$feature_name==feature_lst[i]),]
  aaa<-aaa[,-6]
  colnames(aaa)[1]<-"SubjectID"
  ID_lst <- unique(aaa$ID)
  aaa$feature_value<-as.numeric(aaa$feature_value)
  cof<-ddply(aaa,~SubjectID,summarise,max=max(feature_value),min=min(feature_value),
             median=median(feature_value)
  )
  aaa<-na.omit(aaa)
  mods = dlply(aaa, .(SubjectID), lm, formula = feature_value ~ feature_delta)
  coefs = ldply(mods, coef)
  cof<-merge(cof,coefs[,c(1,3)],by="SubjectID",all.x=TRUE)
  colnames(cof)[2:5]<-paste(feature_lst[i],c("max","min","median","slope"),sep="_")
  slope<-merge(slope,cof,by="SubjectID",all=TRUE)
}
# write file 
write.csv(slope,"slope_raw_validate_new.csv", row.names = FALSE)

0.2.2.3 Present Missing data

## quartz_off_screen 
##                 2
Figure 3 shows the disconcruency of the missing pattern between training and testing dataset

Figure 3 shows the disconcruency of the missing pattern between training and testing dataset

0.2.2.4 Descriptive Statistics for the raw 175 variables within the training and testing data collections (Table S.3)

Var n.x mean.x sd.x n.y mean.y sd.y
Age_mean 2424 54.5375412541254 11.4590297077509 101 57.2573714905737 10.7192613017683
Albumin_max 2109 47.0136320531057 3.32140960354093 72 45.2361111111111 3.18222641325644
Albumin_median 2109 43.9532954006638 2.72994017117937 72 42.1666666666667 3.21089693931923
Albumin_min 2109 40.7584589853011 3.28539649348497 72 38.9027777777778 4.14239408977371
Albumin_slope 2080 -0.00066304524359485 0.00964402390052194 71 -0.00186345739650714 0.00971895029920851
Alkaline.Phosphatase_max 2109 106.724513987672 61.7278755340277 37 85.945945945946 41.41855860598
Alkaline.Phosphatase_median 2109 78.4561403508772 22.5725342855288 37 66.6216216216216 20.9862406661436
Alkaline.Phosphatase_min 2109 65.918018018018 18.8045151240071 37 57.0540540540541 17.7418932128106
Alkaline.Phosphatase_slope 2079 0.0245522287073217 0.107121072718988 36 0.0178481107415077 0.0454421655158345
ALSFRS_slope 2424 -0.730785587584841 0.62748149433062 101 -0.785880065754675 0.655454936567462
ALSFRS_Total_max 2424 31.7438118811881 5.27392088581054 101 30.0990099009901 5.96239037718103
ALSFRS_Total_median 2424 27.1917285478548 6.60869643850714 101 22.460396039604 8.62661091284313
ALSFRS_Total_min 2424 20.1295379537954 8.59290047306708 101 15.6039603960396 9.31029452586844
ALSFRS_Total_slope 2424 -0.0232826672515287 0.017266353009286 101 -0.0251154776907436 0.0171240457379447
ALT.SGPT._max 2412 53.9509950248756 43.8185743794078 78 59.0769230769231 34.5497079247035
ALT.SGPT._median 2412 32.9956467661692 15.5289390716099 78 32.3974358974359 14.8204314426129
ALT.SGPT._min 2412 23.174087893864 11.3071845494832 78 22.3589743589744 10.7413200946605
ALT.SGPT._slope 2384 0.00127031578973447 0.0806063706450099 77 -0.0176662874687794 0.0782988486519406
AST.SGOT._max 2413 42.9656029838375 34.1855132163785 78 48.3333333333333 32.3907420902976
AST.SGOT._median 2413 29.1918773311231 9.60296304111526 78 28.8717948717949 9.35845887850946
AST.SGOT._min 2413 21.7508910070452 7.50312015209157 78 21.0897435897436 6.26510827931822
AST.SGOT._slope 2382 -0.000658123656127784 0.0598963682399829 77 -0.0122223386334234 0.0503250399310411
Basophils_max 1841 1.08853883758827 1.11627490221824 78 1.51538461538462 0.758857124802908
Basophils_median 1841 0.478625746876697 0.243576636386292 78 0.675 0.210711534210519
Basophils_min 1841 0.149158066268332 0.218220564281621 78 0.319230769230769 0.205797588306198
Basophils_slope 1780 -0.000125764657213237 0.00136122634354122 77 9.58055039710486e-05 0.0022278482515412
Bicarbonate_max 2060 30.6850485436893 3.43297153888046 78 27.9217948717949 2.74570017734447
Bicarbonate_median 2060 26.8749757281553 2.41353713159108 78 24.4891025641026 2.20281177217431
Bicarbonate_min 2060 23.1913106796117 2.60953480706033 78 21.1871794871795 2.28571844821466
Bicarbonate_slope 2032 -0.00101449911256295 0.0130995854343925 77 0.00190032515538861 0.00822812361338164
Blood.Urea.Nitrogen..BUN._max 2412 7.36933679519071 2.29440859659421 78 7.71951282051282 2.13834432094049
Blood.Urea.Nitrogen..BUN._median 2412 5.59243985074627 1.35901812948061 78 5.75203205128205 1.52476657855858
Blood.Urea.Nitrogen..BUN._min 2412 4.183202960199 1.35129325674648 78 4.20466666666667 1.29009231187546
Blood.Urea.Nitrogen..BUN._slope 2386 9.27453772868006e-06 0.00594527686260093 77 -0.000867295953494294 0.00503249957013987
BMI_max 1363 0.00257989787651352 0.00043693592222543 78 0.0026310647019252 0.00040534507266651
BMI_median 1363 0.00257726803310333 0.000435285567144275 78 0.0026310647019252 0.00040534507266651
BMI_min 1363 0.00257463509610351 0.000434026355839315 78 0.0026310647019252 0.00040534507266651
BMI_slope 242 3.49831438107787e-07 4.80387777606735e-06 0 NaN NA
bp_diastolic_max 2176 91.9954044117647 9.34311361240496 78 90.5641025641026 8.73058475837158
bp_diastolic_median 2176 81.1148897058823 7.76115519229348 78 80.1410256410256 7.30099505578033
bp_diastolic_min 2176 69.7720588235294 8.99585961407127 78 67.5512820512821 7.31596268602499
bp_diastolic_slope 2176 -0.00128472740620271 0.0218826468989993 78 -0.00667326941155003 0.0291790957823273
bp_systolic_max 2176 147.091452205882 16.5452608210706 78 143.5 14.0164560057469
bp_systolic_median 2176 129.504595588235 12.8633928425817 78 125.410256410256 11.2295249722431
bp_systolic_min 2176 113.931066176471 11.8678094489092 78 108.923076923077 11.1760054090492
bp_systolic_slope 2176 -0.00799534171637676 0.0342797837462885 78 -0.00965396319905002 0.0353097583980406
Calcium_max 2108 2.47509643026565 0.185095531617863 72 2.54430555555556 0.107252189168227
Calcium_median 2108 2.34559908206831 0.090798733570523 72 2.41982638888889 0.0875812765266859
Calcium_min 2108 2.22246499122391 0.177431835802872 72 2.28527777777778 0.10743123869952
Calcium_slope 2079 4.85624997284194e-05 0.000387001055673194 71 -1.03281579376726e-05 0.000296532203830862
Chloride_max 2250 107.072577777778 2.72726774452538 78 105.24358974359 2.03977269828593
Chloride_median 2250 103.427733333333 2.40089925349514 78 102.339743589744 1.82246726894805
Chloride_min 2250 99.3092888888889 3.52116233627607 78 99.2179487179487 2.94824374707065
Chloride_slope 2219 -0.00117245819928419 0.0128013752777168 77 -0.00331822886696961 0.0109623128378599
CK_max 1945 512.351670951157 455.299383403162 36 1121.77777777778 3658.2300100389
CK_median 1945 301.531053984576 264.692112745275 36 264.194444444444 244.596819211236
CK_min 1945 170.035526992288 159.869237968098 36 142.055555555556 101.857307597988
CK_slope 1915 -0.100658972347894 0.574678418733238 36 0.222995689834011 1.94438136425215
Creatinine_max 2412 79.3120580431177 20.4044964412935 78 76.72 36.889060497871
Creatinine_median 2412 65.7550116086235 17.898575703638 78 57.8228205128205 16.8596654538018
Creatinine_min 2412 52.5264943615257 18.5329613539573 78 44.0882051282051 17.636321858161
Creatinine_slope 2385 -0.0290414087540594 0.0433624748291318 77 -0.0441979350586713 0.0410528373669304
Eosinophils_max 1776 4.19279279279279 2.5296247840395 78 4.36538461538461 2.06591553385103
Eosinophils_median 1776 2.30242117117117 1.43340913219548 78 2.19358974358974 1.06180313967605
Eosinophils_min 1776 1.18220720720721 1.09648291708696 78 0.993589743589744 0.735613489631074
Eosinophils_slope 1715 -0.000276042381163875 0.00373969605468944 77 -0.000449842547448364 0.00614833883409007
fvc_percent_max 2257 87.7004342422103 17.1008822041781 64 92.9973958333333 17.4651704846968
fvc_percent_median 2257 75.5595081800905 18.9957503786435 64 77.640625 19.3027540219184
fvc_percent_min 2257 55.197839318849 25.2909433628203 64 54.3828125 27.7152578917573
fvc_percent_slope 2254 -0.0663665666490335 0.19170083274742 62 -0.0816629530015356 0.104075541116402
Gamma.glutamyltransferase_max 1867 58.5538296732726 79.0328778917009 37 46.4864864864865 31.6057498475107
Gamma.glutamyltransferase_median 1867 34.8623460096411 36.0274704864039 37 28.8918918918919 18.2570896180449
Gamma.glutamyltransferase_min 1867 24.3656132833423 24.0468871711512 37 20.2702702702703 11.466493818098
Gamma.glutamyltransferase_slope 1838 0.0098813623526445 0.143759624959074 36 0.00607747015560745 0.032959323679861
Glucose_max 2170 7.17329677419355 2.57277387970439 78 7.78570512820513 2.06115963520333
Glucose_median 2170 5.49431923963134 1.2772494555249 78 5.4673717948718 1.06605146691106
Glucose_min 2170 4.26760151612903 1.27924583695133 78 4.35230769230769 0.703532687057426
Glucose_slope 2196 0.000515908627398624 0.00469970375778664 77 -0.000841185906820127 0.00882693606967879
hands_max 2424 6.19884488448845 1.94774457814011 101 5.8019801980198 2.16342229802782
hands_median 2424 4.93719059405941 2.43224315298442 101 3.98019801980198 2.65981276792109
hands_min 2424 3.11716171617162 2.59962266481603 101 2.21782178217822 2.41496981511587
hands_slope 2424 -0.00570983606347995 0.00516913173845415 101 -0.00587949118702615 0.00513066976183382
HbA1c..Glycated.Hemoglobin._max 1511 5.97647915287889 0.913463994691757 NA NA NA
HbA1c..Glycated.Hemoglobin._median 1511 5.44060886829914 0.639825431407871 NA NA NA
HbA1c..Glycated.Hemoglobin._min 1511 4.95290536068829 0.597971928656614 NA NA NA
HbA1c..Glycated.Hemoglobin._slope 1488 -4.54496513342089e-05 0.00124058323118751 NA NA NA
Hematocrit_max 2224 41.9417153776978 12.9306948288911 78 47.7038461538462 3.86514367964965
Hematocrit_median 2224 39.4702032374101 12.118830542526 78 43.6935897435897 3.15355251104107
Hematocrit_min 2224 36.9653210431655 11.5569121896324 78 40.1410256410256 3.35851064306691
Hematocrit_slope 2194 0.000253297128528394 0.010010488485032 77 0.00147936738970224 0.0148984249534962
Hemoglobin_max 2224 152.142805755396 12.7185245173477 78 154.692307692308 11.7067054736571
Hemoglobin_median 2224 144.288893884892 11.6002376323046 78 144.282051282051 10.5123890248755
Hemoglobin_min 2224 135.460922661871 14.8872658504018 78 133.705128205128 11.748566062665
Hemoglobin_slope 2194 -0.000237468012819504 0.0302987113256346 77 -0.0038880080774771 0.0419916531816279
leg_max 2424 5.31848184818482 2.23395660623283 101 4.77227722772277 2.37436807643134
leg_median 2424 4.07054455445545 2.28205481918521 101 3.16831683168317 2.44670924684848
leg_min 2424 2.54001650165017 2.14786828635806 101 1.76237623762376 2.10783545302514
leg_slope 2424 -0.00508442048414411 0.00465417680089854 101 -0.00485356584888292 0.00423950162322656
Lymphocytes_max 1851 32.3286331712588 7.76821112114374 78 32.9987179487179 7.45470292752137
Lymphocytes_median 1851 25.8169097784981 6.31685372094809 78 25.2858974358974 5.92089573688022
Lymphocytes_min 1851 18.9065207995678 6.37767534510343 78 17.3807692307692 5.734358911672
Lymphocytes_slope 1779 -0.00478133886995476 0.0159349899736102 77 -0.00741887936214243 0.0385628828700545
Monocytes_max 1853 8.72142471667566 2.71222329547321 78 9.45769230769231 4.57663516249541
Monocytes_median 1853 5.96675661090124 1.77869519349936 78 6.56025641025641 1.57734280453246
Monocytes_min 1853 3.85895844576363 1.85833904941223 78 4.58717948717949 1.32785922567497
Monocytes_slope 1779 -0.00184840813557744 0.00573745431276936 77 -0.00157236519435078 0.0081939387424273
mouth_max 2424 10.7570132013201 1.89421875798073 101 10.5841584158416 1.89877500896064
mouth_median 2424 9.72524752475248 2.75501942032917 101 8.56930693069307 3.40369630179478
mouth_min 2424 7.84034653465347 3.71660194923749 101 6.32673267326733 4.13789538507461
mouth_slope 2424 -0.00507728339996061 0.00628746488325555 101 -0.00637143284150849 0.00636442787700772
Neutrophils_max 1786 73.5036394176932 7.59618459232422 42 74.8547619047619 6.58113350682167
Neutrophils_median 1786 65.0837066069429 7.3431646930973 42 64.9535714285714 6.04990821193366
Neutrophils_min 1786 57.0599048152296 8.91092977206641 42 56.2595238095238 7.81234356405515
Neutrophils_slope 1714 0.00669496389080631 0.0189001866113887 41 0.0144226259889641 0.0624384261323121
onset_delta_mean 2418 -686.872208436725 414.451687020307 92 -530.989130434783 231.437480916255
Phosphorus_max 2108 1.3922770256167 0.230172265157576 36 1.41924166666667 0.133117508508192
Phosphorus_median 2108 1.19182457305503 0.121608040253329 36 1.21089583333333 0.147335204974914
Phosphorus_min 2108 1.02383847248577 0.133486006088975 36 0.999513888888889 0.14844392717748
Phosphorus_slope 2077 5.64147967508512e-05 0.000425834118538429 36 0.00014922502619943 0.000302480822252272
Platelets_max 2106 285.874169040836 72.0559879135821 78 311.884615384615 78.7676149368328
Platelets_median 2106 239.102801519468 54.0511304228343 78 282.666666666667 65.079604933919
Platelets_min 2106 208.625449667616 50.6622707272523 78 261.153846153846 71.9935200769273
Platelets_slope 2078 0.0186003620328334 0.0992053808162082 39 0.0326726154314932 0.0989955472289394
Potassium_max 2411 4.62399834093737 1.2773264765901 78 4.64358974358974 0.383622427871686
Potassium_median 2411 4.18821858150145 0.248106500529091 78 4.16923076923077 0.216859336962862
Potassium_min 2411 3.85654500207383 0.26610617781299 78 3.77435897435897 0.253513440016628
Potassium_slope 2384 -0.000426652821675778 0.0241506854806198 77 -7.00343910746221e-05 0.00115760742615253
Protein_max 2043 76.8311306901615 4.25388686052967 36 74.0277777777778 5.07366371491681
Protein_median 2043 72.2857807146353 3.56415385033174 36 69.4583333333333 3.93405461506138
Protein_min 2043 68.0037689672051 4.13294383815223 36 64.8333333333333 4.46894043050795
Protein_slope 2079 -0.00106407224981409 0.0107602891424065 36 -0.000844172540810016 0.00643223506827082
pulse_max 2176 90.5772058823529 11.9819601837299 78 95.9615384615385 13.6728399919897
pulse_median 2176 76.8559283088235 9.11825461027748 78 78.7692307692308 9.88133893462111
pulse_min 2176 65.5698529411765 8.25689213617804 78 64.7051282051282 8.36366441524781
pulse_slope 2176 0.0109366783402785 0.028729134018368 78 0.0111239035221543 0.0284267227223097
Red.Blood.Cells..RBC._max 1974 205526512.31003 956239460.280868 78 5070.51282051282 426.321924078956
Red.Blood.Cells..RBC._median 1974 4670.78774062817 418.757968545842 78 4716.47435897436 372.235357900968
Red.Blood.Cells..RBC._min 1974 4385.89900202634 460.893142792287 78 4360.12820512821 405.262440509413
Red.Blood.Cells..RBC._slope 2011 -85160.1917018794 549551.166890205 77 0.227030868857412 2.13966702912828
respiratory_max 2424 3.90594059405941 0.308467953791843 101 3.87128712871287 0.391493712251772
respiratory_median 2424 3.58415841584158 0.618435541384743 101 3.12871287128713 1.15467195632901
respiratory_min 2424 2.79909240924092 1.05623124072513 101 2.03960396039604 1.42070962606162
respiratory_slope 2424 -0.00148984643845156 0.00236948213120654 101 -0.0021931899434463 0.00291521931967747
Sodium_max 2411 143.361012028204 2.342410285652 78 143.192307692308 2.19833285302006
Sodium_median 2411 140.132061385317 1.79426317878301 78 140.269230769231 1.48300394446566
Sodium_min 2411 136.791165491497 2.8685634660764 78 137.448717948718 1.95831517764591
Sodium_slope 2384 0.00159240122849962 0.0171628406344419 77 0.000158744398170742 0.0072989488116711
Total.Cholesterol_max 1865 6.65718943699732 1.21795564118192 36 6.62690277777778 1.17291082634763
Total.Cholesterol_median 1865 5.87440393565684 1.03928644879369 36 5.72589722222222 0.9933042446404
Total.Cholesterol_min 1865 5.07456998284182 0.992283893885334 36 4.81631944444444 0.939556206125133
Total.Cholesterol_slope 1835 -0.000145353277074433 0.00200216634153814 36 2.02647546391879e-05 0.00208590906741232
Triglycerides_max 1703 3.13492765120376 2.12890397829411 36 3.31741111111111 1.79473945753818
Triglycerides_median 1703 2.01650785965942 1.23693073131958 36 1.92714722222222 0.970796243661021
Triglycerides_min 1703 1.31833285613623 0.80349613930263 36 1.27751388888889 0.720539952754789
Triglycerides_slope 1673 -0.000180328598627944 0.00239485561856983 36 -0.000304740547514068 0.00197306999278624
trunk_max 2424 6.21452145214521 1.72936198599806 101 5.71287128712871 2.05103209952144
trunk_median 2424 4.91542904290429 2.1348631125207 101 3.7970297029703 2.53197375363745
trunk_min 2424 3.02062706270627 2.36871275208369 101 2.06930693069307 2.22376898864326
trunk_slope 2424 -0.00593485685094305 0.00479970307092377 101 -0.00585878681037928 0.0047303152106807
Urine.Ph_max 2042 6.80551420176298 0.971695384711862 78 7.07692307692308 0.674570200234901
Urine.Ph_median 2042 5.68303134182174 0.646495674028862 78 6.19230769230769 0.572535765517931
Urine.Ph_min 2042 5.19995102840353 0.452955883489195 78 5.56410256410256 0.45839198304858
Urine.Ph_slope 1827 0.000199748777318122 0.00588011875915285 77 0.000867929010297606 0.00442099295374208
White.Blood.Cell..WBC._max 1983 8.97351941502774 2.55746550076234 78 9.34179487179487 2.93952895934544
White.Blood.Cell..WBC._median 1983 6.95655219364599 1.59840051489864 78 6.78910256410256 1.68326421126584
White.Blood.Cell..WBC._min 1983 5.77965708522441 1.44307811759807 78 5.28602564102564 1.51386092770271
White.Blood.Cell..WBC._slope 2011 0.000831804155604706 0.00602416734953845 77 0.00147281746308479 0.00717243640578036
Bilirubin..total._max NA NA NA 72 12.4413888888889 5.98023724282224
Bilirubin..total._median NA NA NA 72 8.13923611111111 3.37452246592667
Bilirubin..total._min NA NA NA 72 5.21527777777778 2.37266592332856
Bilirubin..total._slope NA NA NA 77 -7.9458431254426e-05 0.0151282727240631

0.2.3 deal with missing values in datasets

0.2.3.1 Final testing dataset(78 obs. of 100 variables) and final training data (2223 obs. of 100 variables)

data2<-data2[,order(colnames(data2))]

ID<-which(apply(is.na(data2),1,sum)<136)
data2_1<-data2[ID,]

col_ID<-which(apply(is.na(data2_1),2,sum)>9)
data2_2<-data2_1[,-col_ID]
for(i in 1:ncol(data2_2)){
  m<-median(data2_2[,i])
  for (j in 1:nrow(data2_2)){
    if(is.na(data2_2[j,i])){data2_2[j,i]<-m}
    if(is.infinite(data2_2[j,i])){data2_2[j,i]<-m}
  }
}

data2_2=data.frame(apply(data2_2,2,fun))

#delete redundant BMI_max, BMI_median
data2_2<-data2_2[,-c(35,36)]
col_2<-colnames(data2_2)

data1<-data1[,order(colnames(data1))]
col_chose<-is.element(colnames(data1),col_2)
data1_1<-data1[,col_chose]
data1_2<-data1_1[,which(apply(is.na(data1_1),2,sum)<400)]
data1_3<-data1_2[which(apply(is.na(data1_2),1,sum)<20),]
for(i in 1:ncol(data1_3)){
  m<-median(data1_3[,i])
  for (j in 1:nrow(data1_3)){
    if(is.na(data1_3[j,i])){data1_3[j,i]<-m}
    if(is.infinite(data1_3[j,i])){data1_3[j,i]<-m}
  }
}
data1_3<-data.frame(apply(data1_3,2,fun))

data2_2<-data2_2[,is.element(colnames(data2_2),colnames(data1_3))] 
data2_2<-data2_2[,order(colnames(data2_2))]
data1_3<-data1_3[,order(colnames(data1_3))]
#Delete Race_mean
write.csv(data2_2[,-85],"complete_testing.csv", row.names = FALSE)
write.csv(data1_3[,-85],"complete_training.csv", row.names = FALSE)

0.2.3.2 Basic Patient Demographics (Table 1)

### Gender: Male=2, Female=1
### Onset_site: Bulbar=1, Limb=2, Limb&Bular=3
###Final Training Dataset
df_train <- read.csv("complete_training.csv")
age_final <- df_train$Age_mean #2223 obs
age_stat <- c(min(age_final),median(age_final), max(age_final))
onsetdelta_final <- df_train$onset_delta_mean 
onsetdelta_stat <- c(min(onsetdelta_final),median(onsetdelta_final), max(onsetdelta_final))
gender_final <- df_train$Gender_mean
gender_stat <- c(sum(gender_final==2)/2223,sum(gender_final==1)/2223)
onset_final <- df_train$onset_site_mean
onset_stat <- table(onset_final)

###Raw PRO-ACT Dataset
age_raw <- df[df$feature_name == "Age",] #6565 obs
age_raw$feature_value <- as.numeric(age_raw$feature_value)
age_stat <- c(min(age_raw$feature_value),median(age_raw$feature_value), max(age_raw$feature_value))
gender_raw <- df[df$feature_name == "Gender",] #6565 obs
gender_stat <- c(sum(gender_raw$feature_value=="M")/nrow(gender_raw),sum(gender_raw$feature_value=="F")/nrow(gender_raw))
onsetdelta_raw <- df[df$feature_name == "onset_delta",] #4985 obs
patient_with_postive_onsetdelta <- onsetdelta_raw[onsetdelta_raw$feature_value == "184",]
onsetdelta_raw$feature_value <- as.numeric(onsetdelta_raw$feature_value)
onsetdelta_stat <- c(min(onsetdelta_raw$feature_value),median(onsetdelta_raw$feature_value), max(onsetdelta_raw$feature_value))
onset_raw <- df[df$feature_name == "onset_site",] #7351 obs
onset_stat <- table(onset_raw$feature_value)

0.3 Predictive Analytics

0.3.1 Analysis

0.3.1.1 Linear Regression

training <- read.csv("complete_training.csv",header=TRUE)
testing <- read.csv("complete_testing.csv",header=TRUE)
obj<-lm(ALSFRS_slope~.,data=training[,-93]) 
obj2<-step(obj, trace = FALSE) 
y_p<-predict(obj2,testing) 
test.y<-testing$ALSFRS_slope 
write.csv(y_p,"LR_predict.csv", row.names = FALSE)

0.3.1.2 Random forest

set.seed(7788888)
#load datasets
df_train <- read.csv("complete_training.csv")
df_test <- read.csv("complete_testing.csv")

#data type transformation
df_train$onset_site_mean <- as.factor(df_train$onset_site_mean)
df_test$onset_site_mean <- as.factor(df_test$onset_site_mean)
levels(df_train$onset_site_mean)[4] <- "4"

df_train$Gender_mean <- as.factor(df_train$Gender_mean)
df_test$Gender_mean <- as.factor(df_test$Gender_mean)

rf.fit <-  train(ALSFRS_slope~., data = df_train[,-93], method = "rf", trControl = 
                   trainControl(method = "cv", number = 5), allowParallel=TRUE)

rf.result <- data.frame(df_test$ALSFRS_slope)
colnames(rf.result)[1] <- "ALSFRS_slope"
rf.result$prediction <- predict(rf.fit, df_test)

write.csv(rf.result,"RF_predict.csv",row.names = FALSE)

0.3.1.3 Bayesian Additive Regression Trees (BART)

#########BART###############
# set_bart_machine_memory(10000)
set.seed(614)
data1<-read.csv('complete_training.csv',header=TRUE)
train.x<-data1[,-c(6,93)]
train.y<-data1[,6]
data2<-read.csv('complete_testing.csv',header=TRUE)
test.x<-data2[,-c(6,93)]
test.y<-data2[,6]
options(java.parameters="-Xmx68000m")
set_bart_machine_num_cores(8)
bart_machine_cv <- bartMachineCV(train.x, train.y, mem_cache_for_speed = FALSE)##CV
summary(bart_machine_cv)
y_p<-bart_predict_for_test_data(bart_machine_cv,test.x,test.y)$y_hat
write.csv(y_p,"bart_predict.csv")

0.3.1.4 Super Learner

set.seed(23432)
data1<-read.csv("complete_training.csv", header =TRUE)
train.x<-data1[,-c(6,93)]
train.y<-data1[,6]
data2<-read.csv("complete_testing.csv",header=TRUE)
test.x<-data2[,-c(6,93)]
test.y<-data2[,6]
test.x<-test.x[,order(colnames(test.x))]
train.x<-train.x[,order(colnames(train.x))]

# generate Library and run Super Learner
SL.library <- c("SL.svm", "SL.ridge", "SL.randomForest","SL.mean","SL.caret", "SL.rpart","SL.stepAIC","SL.step.forward", "SL.step")

test_predict <- SampleSplitSuperLearner(Y = train.y, X = train.x, newX = test.x, SL.library = SL.library, verbose = FALSE, method = "method.NNLS")
## + Fold01: mtry= 2 
## - Fold01: mtry= 2 
## + Fold01: mtry=50 
## - Fold01: mtry=50 
## + Fold01: mtry=98 
## - Fold01: mtry=98 
## + Fold02: mtry= 2 
## - Fold02: mtry= 2 
## + Fold02: mtry=50 
## - Fold02: mtry=50 
## + Fold02: mtry=98 
## - Fold02: mtry=98 
## + Fold03: mtry= 2 
## - Fold03: mtry= 2 
## + Fold03: mtry=50 
## - Fold03: mtry=50 
## + Fold03: mtry=98 
## - Fold03: mtry=98 
## + Fold04: mtry= 2 
## - Fold04: mtry= 2 
## + Fold04: mtry=50 
## - Fold04: mtry=50 
## + Fold04: mtry=98 
## - Fold04: mtry=98 
## + Fold05: mtry= 2 
## - Fold05: mtry= 2 
## + Fold05: mtry=50 
## - Fold05: mtry=50 
## + Fold05: mtry=98 
## - Fold05: mtry=98 
## + Fold06: mtry= 2 
## - Fold06: mtry= 2 
## + Fold06: mtry=50 
## - Fold06: mtry=50 
## + Fold06: mtry=98 
## - Fold06: mtry=98 
## + Fold07: mtry= 2 
## - Fold07: mtry= 2 
## + Fold07: mtry=50 
## - Fold07: mtry=50 
## + Fold07: mtry=98 
## - Fold07: mtry=98 
## + Fold08: mtry= 2 
## - Fold08: mtry= 2 
## + Fold08: mtry=50 
## - Fold08: mtry=50 
## + Fold08: mtry=98 
## - Fold08: mtry=98 
## + Fold09: mtry= 2 
## - Fold09: mtry= 2 
## + Fold09: mtry=50 
## - Fold09: mtry=50 
## + Fold09: mtry=98 
## - Fold09: mtry=98 
## + Fold10: mtry= 2 
## - Fold10: mtry= 2 
## + Fold10: mtry=50 
## - Fold10: mtry=50 
## + Fold10: mtry=98 
## - Fold10: mtry=98 
## Aggregating results
## Selecting tuning parameters
## Fitting mtry = 50 on full training set
## + Fold01: mtry= 2 
## - Fold01: mtry= 2 
## + Fold01: mtry=50 
## - Fold01: mtry=50 
## + Fold01: mtry=98 
## - Fold01: mtry=98 
## + Fold02: mtry= 2 
## - Fold02: mtry= 2 
## + Fold02: mtry=50 
## - Fold02: mtry=50 
## + Fold02: mtry=98 
## - Fold02: mtry=98 
## + Fold03: mtry= 2 
## - Fold03: mtry= 2 
## + Fold03: mtry=50 
## - Fold03: mtry=50 
## + Fold03: mtry=98 
## - Fold03: mtry=98 
## + Fold04: mtry= 2 
## - Fold04: mtry= 2 
## + Fold04: mtry=50 
## - Fold04: mtry=50 
## + Fold04: mtry=98 
## - Fold04: mtry=98 
## + Fold05: mtry= 2 
## - Fold05: mtry= 2 
## + Fold05: mtry=50 
## - Fold05: mtry=50 
## + Fold05: mtry=98 
## - Fold05: mtry=98 
## + Fold06: mtry= 2 
## - Fold06: mtry= 2 
## + Fold06: mtry=50 
## - Fold06: mtry=50 
## + Fold06: mtry=98 
## - Fold06: mtry=98 
## + Fold07: mtry= 2 
## - Fold07: mtry= 2 
## + Fold07: mtry=50 
## - Fold07: mtry=50 
## + Fold07: mtry=98 
## - Fold07: mtry=98 
## + Fold08: mtry= 2 
## - Fold08: mtry= 2 
## + Fold08: mtry=50 
## - Fold08: mtry=50 
## + Fold08: mtry=98 
## - Fold08: mtry=98 
## + Fold09: mtry= 2 
## - Fold09: mtry= 2 
## + Fold09: mtry=50 
## - Fold09: mtry=50 
## + Fold09: mtry=98 
## - Fold09: mtry=98 
## + Fold10: mtry= 2 
## - Fold10: mtry= 2 
## + Fold10: mtry=50 
## - Fold10: mtry=50 
## + Fold10: mtry=98 
## - Fold10: mtry=98 
## Aggregating results
## Selecting tuning parameters
## Fitting mtry = 50 on full training set
y_p_sl<-test_predict$SL.predict
write.csv(y_p_sl,"SL_predict.csv",row.names = FALSE)

0.3.2 Measurement of accuracy between 4 methods

##   measurement  LinearRegression     RandomForests              BART
## 1   R-squared 0.562409822938255   0.6880210876278 0.680649317033897
## 2        RMSE 0.426882827349752 0.360443768743188 0.364677380647518
## 3 Correlation 0.758198331190405 0.831666868731394 0.828442892519017
##        SuperLearner
## 1 0.655100925359185
## 2 0.378984036164353
## 3 0.819548257119423

0.4 Adverse Event

0.4.1 Extract useful information from adverse events

0.4.1.1 Adverse Events in raw training dataset

##train_AE.csv obtained by excel
df1<-read.csv("Data/train_AE.csv", header=TRUE, fill = TRUE,  stringsAsFactors = FALSE)
ID<-grep("Gastrointestinal",df1$feature_name)

df1$resp<-0

ID1<-intersect(grep("failure",df1$feature_value),grep("Respiratory",df1$feature_name)) 
ID2<-intersect(grep("Choking",df1$feature_value),grep("Respiratory",df1$feature_name)) 
ID3<-intersect(grep("insufficiency",df1$feature_value),grep("Respiratory",df1$feature_name)) 
ID4<-intersect(grep("Orthopnea",df1$feature_value),grep("Respiratory",df1$feature_name)) 
ID5<-intersect(grep("distress",df1$feature_value),grep("Respiratory",df1$feature_name)) 
ID6<-intersect(grep("Hoarse voice",df1$feature_value),grep("Respiratory",df1$feature_name)) 

df1$resp[ID1]<-1
df1$resp[ID2]<-df1$resp[ID2]+1
df1$resp[ID3]<-df1$resp[ID3]+1
df1$resp[ID4]<-df1$resp[ID4]+1
df1$resp[ID5]<-df1$resp[ID5]+1
df1$resp[ID6]<-df1$resp[ID6]+1

df1$infection<-0
ID1<-intersect(grep("upper respiratory",df1$feature_value,ignore.case = TRUE),grep("infection",df1$feature_name,ignore.case = TRUE)) 
df1$infection[ID1]<-1

df1$muscle<-0
ID1<-intersect(grep("weakness",df1$feature_value,ignore.case = TRUE),grep("muscle",df1$feature_name,ignore.case = TRUE)) 
df1$muscle[ID1]<-1

df1$other<-0
ID1<-grep("weight decrease",df1$feature_value,ignore.case = TRUE) 
df1$other[ID1]<-1

df1$injuries<-0
ID1<-intersect(grep("fall",df1$feature_value,ignore.case = TRUE),grep("injuries",df1$feature_name,ignore.case = TRUE)) 
df1$injuries[ID1]<-1

df1$move<-0
ID1<-intersect(grep("tremor",df1$feature_value,ignore.case = TRUE),grep("movement",df1$feature_name,ignore.case = TRUE)) 
df1$move[ID1]<-1

df1$musculoskeletal<-0
ID1<-intersect(grep("chewing",df1$feature_value,ignore.case = TRUE),grep("musculoskeletal",df1$feature_name,ignore.case = TRUE)) 
df1$musculoskeletal[ID1]<-1

df1$neurological<-0
ID1<-intersect(grep("speech",df1$feature_value,ignore.case = TRUE),grep("neurological",df1$feature_name,ignore.case = TRUE)) 
df1$neurological[ID1]<-1

df1$neuromuscular<-0
ID1<-intersect(grep("spasticity",df1$feature_value,ignore.case = TRUE),grep("neuromuscular",df1$feature_name,ignore.case = TRUE)) 
df1$neuromuscular[ID1]<-1

df1$cranial_nerve<-0
ID1<-intersect(grep("paralysis",df1$feature_value,ignore.case = TRUE),grep("cranial nerve",df1$feature_name,ignore.case = TRUE)) 
df1$cranial_nerve[ID1]<-1

df1$neurological<-0
ID1<-intersect(grep("dysarthria",df1$feature_value,ignore.case = TRUE),grep("neurological",df1$feature_name,ignore.case = TRUE)) 
df1$neurological[ID1]<-1


df1$peripheral_neuropath<-0
ID1<-intersect(grep("foot drop",df1$feature_value,ignore.case = TRUE),grep("peripheral neuropath",df1$feature_name,ignore.case = TRUE)) 
df1$peripheral_neuropath[ID1]<-1

df1<-as.data.frame(df1)

df_disease1<-df1[,c(2,6:16)]
aaa1<-rowsum(df_disease1,df_disease1$SubjectID)
aaa1$SubjectID<-as.numeric(rownames(aaa1))
write.csv(aaa1,"AE_mapping_train_wizID.csv", row.names = FALSE)

0.4.1.2 Adverse Events in raw testing dataset

##test_AE.csv obtained by excel
df1<-read.csv("Data/test_AE.csv", header=TRUE, fill = TRUE,  stringsAsFactors = FALSE)
ID<-grep("Gastrointestinal",df1$feature_name)

df1$resp<-0

ID1<-intersect(grep("failure",df1$feature_value),grep("Respiratory",df1$feature_name)) 
ID2<-intersect(grep("Choking",df1$feature_value),grep("Respiratory",df1$feature_name)) 
ID3<-intersect(grep("insufficiency",df1$feature_value),grep("Respiratory",df1$feature_name)) 
ID4<-intersect(grep("Orthopnea",df1$feature_value),grep("Respiratory",df1$feature_name)) 
ID5<-intersect(grep("distress",df1$feature_value),grep("Respiratory",df1$feature_name)) 
ID6<-intersect(grep("Hoarse voice",df1$feature_value),grep("Respiratory",df1$feature_name)) 

df1$resp[ID1]<-1
df1$resp[ID2]<-df1$resp[ID2]+1
df1$resp[ID3]<-df1$resp[ID3]+1
df1$resp[ID4]<-df1$resp[ID4]+1
df1$resp[ID5]<-df1$resp[ID5]+1
df1$resp[ID6]<-df1$resp[ID6]+1

df1$infection<-0
ID1<-intersect(grep("upper respiratory",df1$feature_value,ignore.case = TRUE),grep("infection",df1$feature_name,ignore.case = TRUE)) 
df1$infection[ID1]<-1

df1$muscle<-0
ID1<-intersect(grep("weakness",df1$feature_value,ignore.case = TRUE),grep("muscle",df1$feature_name,ignore.case = TRUE)) 
df1$muscle[ID1]<-1

df1$other<-0
ID1<-grep("weight decrease",df1$feature_value,ignore.case = TRUE) 
df1$other[ID1]<-1

df1$injuries<-0
ID1<-intersect(grep("fall",df1$feature_value,ignore.case = TRUE),grep("injuries",df1$feature_name,ignore.case = TRUE)) 
df1$injuries[ID1]<-1

df1$move<-0
ID1<-intersect(grep("tremor",df1$feature_value,ignore.case = TRUE),grep("movement",df1$feature_name,ignore.case = TRUE)) 
df1$move[ID1]<-1

df1$musculoskeletal<-0
ID1<-intersect(grep("chewing",df1$feature_value,ignore.case = TRUE),grep("musculoskeletal",df1$feature_name,ignore.case = TRUE)) 
df1$musculoskeletal[ID1]<-1

df1$neurological<-0
ID1<-intersect(grep("speech",df1$feature_value,ignore.case = TRUE),grep("neurological",df1$feature_name,ignore.case = TRUE)) 
df1$neurological[ID1]<-1

df1$neuromuscular<-0
ID1<-intersect(grep("spasticity",df1$feature_value,ignore.case = TRUE),grep("neuromuscular",df1$feature_name,ignore.case = TRUE)) 
df1$neuromuscular[ID1]<-1

df1$cranial_nerve<-0
ID1<-intersect(grep("paralysis",df1$feature_value,ignore.case = TRUE),grep("cranial nerve",df1$feature_name,ignore.case = TRUE)) 
df1$cranial_nerve[ID1]<-1

df1$neurological<-0
ID1<-intersect(grep("dysarthria",df1$feature_value,ignore.case = TRUE),grep("neurological",df1$feature_name,ignore.case = TRUE)) 
df1$neurological[ID1]<-1

df1$peripheral_neuropath<-0
ID1<-intersect(grep("foot drop",df1$feature_value,ignore.case = TRUE),grep("peripheral neuropath",df1$feature_name,ignore.case = TRUE)) 
df1$peripheral_neuropath[ID1]<-1

df1<-as.data.frame(df1)

df_disease2<-df1[,c(2,8:18)]
aaa2<-rowsum(df_disease2,df_disease2$SubjectID)
aaa2$SubjectID<-as.numeric(rownames(aaa2))
write.csv(aaa2,"AE_mapping_test_wizID.csv", row.names = FALSE)

0.4.1.3 aggregate adverse events information

training<-read.csv("complete_training.csv",header=TRUE)
testing<-read.csv("complete_testing.csv",header=TRUE)

training_ae<-read.csv("AE_mapping_train_wizID.csv",header=TRUE)
testing_ae<-read.csv("AE_mapping_test_wizID.csv",header=TRUE)

train<-merge(training,training_ae,by="SubjectID")
test<-merge(testing,testing_ae,by="SubjectID")

score_ae<-apply(train[,101:111],1,sum)
train$score_ae<-score_ae

score_ae<-apply(test[,101:111],1,sum)
test$score_ae<-score_ae

write.csv(train,"training_wizAE.csv", row.names = FALSE)
write.csv(test,"testing_wizAE.csv", row.names = FALSE)

0.4.1.4 Statistical difference between features in the subjects with and without adverse events in complete training dataset(Wilcoxon-Mann-Whitney test for continuous covariates and Chi-squared test for categorical covariates)

df_t <- read.csv("training_wizAE.csv")
df_t$AE <- 1 #1519
df_t[df_t$score_ae == 0,]$AE <- 0 #669
AE_group <- which(df_t$AE==1)
NOAE_group <- which(df_t$AE==0)

FeatureIndex <- setdiff(c(1:113), c(1,48,74,101:113)) #SubjectID(1),Gender_mean(48),onset_site_mean(74), AE(101:113)
df_t2 <- df_t[,-c(1,48,74,101:113)]

wTest.tbl <- data.frame(FeatureIndex, names(df_t)[FeatureIndex], c(rep(NA, length(FeatureIndex))), c(rep(NA, length(FeatureIndex))), c(rep(NA, length(FeatureIndex))), c(rep(NA,length(FeatureIndex))))

colnames(wTest.tbl)[1:6] <- c("Feature.Index","FeatureName","Wilcoxon.pvalue","FDR.corrected.pvalue","W.statistic", "Significance")
#test
for (i in 1:ncol(df_t2)){
    t.result <- wilcox.test(df_t[AE_group, i], df_t[NOAE_group, i], paired = FALSE)
    wTest.tbl[i,3] <- round(t.result$p.value, digits = 4)
    wTest.tbl[i,5] <- round(t.result$statistic)
}
wTest.tbl$FDR.corrected.pvalue <- round(p.adjust(wTest.tbl$Wilcoxon.pvalue, "BH",n = length(wTest.tbl$Wilcoxon.pvalue)), digits=4)

wTest.tbl[wTest.tbl$FDR.corrected.pvalue < (0.05),]$Significance <- "*"
wTest.tbl[wTest.tbl$FDR.corrected.pvalue < (0.01),]$Significance <- "**"
wTest.tbl[wTest.tbl$FDR.corrected.pvalue < (0.001),]$Significance <- "***"

wTest.tbl2 <- wTest.tbl[!is.na(wTest.tbl$Significance), ]

write.csv(wTest.tbl2,"wilcoxonTest_AE.csv", row.names = FALSE)

###########

#Chi-squared test for Gender and onset_site
#Gender
tbl_gender <- table(df_t$AE,df_t$Gender_mean)
chisq.test(tbl_gender) #Not significant p=0.05765
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  tbl_gender
## X-squared = 3.6037, df = 1, p-value = 0.05765
#Onset_site
tbl_site <- table(df_t$AE,df_t$onset_site_mean)
chisq.test(tbl_site) #Significant p=0.001583
## 
##  Pearson's Chi-squared test
## 
## data:  tbl_site
## X-squared = 12.897, df = 2, p-value = 0.001583

0.4.2 Investigation on variable importance(with/without adverse events)

0.4.2.1 RandomForests

#load datasets
df_train <- read.csv("training_wizAE.csv")
df_test <- read.csv("testing_wizAE.csv")
#remove unnecessary columns
df_train <- df_train[,-1]
df_test <- df_test[,-1]
#data type transformation?(onset_site_mean, Gender_mean)
df_train$onset_site_mean <- as.factor(df_train$onset_site_mean)
df_test$onset_site_mean <- as.factor(df_test$onset_site_mean)
levels(df_train$onset_site_mean)[4] <- "4"

df_train$Gender_mean <- as.factor(df_train$Gender_mean)
df_test$Gender_mean <- as.factor(df_test$Gender_mean)
################RandomForests#######################
set.seed(49239)
#without AE
rf.fit1 <-  train(ALSFRS_slope~., data = df_train[,-c(100:111)], method = "rf", trControl = 
                     trainControl(method = "cv", number = 5), allowParallel=TRUE, importance = TRUE)
#with AE
rf.fit2 <-  train(ALSFRS_slope~., data = df_train, method = "rf", trControl = 
                      trainControl(method = "cv", number = 5), allowParallel=TRUE, importance = TRUE)

#obtain importance measures of features in the fitted models
rf_noAE.ImpMeasure <- data.frame(varImp(rf.fit1, scale = FALSE)$importance)
rf_noAE.ImpMeasure$Vars <- row.names(rf_noAE.ImpMeasure)
rf_noAE.ImpMeasure <- rf_noAE.ImpMeasure[order(-rf_noAE.ImpMeasure$Overall),]


rf_AE.ImpMeasure <- data.frame(varImp(rf.fit2,scale = FALSE)$importance)
rf_AE.ImpMeasure$Vars <- row.names(rf_AE.ImpMeasure)
rf_AE.ImpMeasure <- rf_AE.ImpMeasure[order(-rf_AE.ImpMeasure$Overall),]


#output
write.csv(rf_noAE.ImpMeasure, "RF_ImpMeasure_withoutAE.csv", row.names = FALSE)
write.csv(rf_AE.ImpMeasure, "RF_ImpMeasure_withAE.csv", row.names = FALSE)

0.4.2.2 BART

options(java.parameters="-Xmx68000m")
library(bartMachine)
set.seed(614)
#without AE
data1<-read.csv('training_wizAE.csv',header=TRUE)
train.x<-data1[,-c(1,7,101:112)]
train.y<-data1[,7]
data2<-read.csv('testing_wizAE.csv',header=TRUE)
test.x<-data2[,-c(1,7,101:112)]
test.y<-data2[,7]
set_bart_machine_num_cores(8)
bart_machine_cv <- bartMachineCV(train.x, train.y, mem_cache_for_speed = FALSE)##CV
aaa1<-investigate_var_importance(bart_machine_cv,type="splits", plot=FALSE, num_replicates_for_avg=5,num_trees_bottleneck=20)
score1<-aaa1$avg_var_props
write.csv(score1,"BART_ImpMeasure_withoutAE.csv")
#with AE
data1<-read.csv('training_wizAE.csv',header=TRUE)
train.x<-data1[,-c(1,7)]
train.y<-data1[,7]
data2<-read.csv('testing_wizAE.csv',header=TRUE)
test.x<-data2[,-c(1,7)]
test.y<-data2[,7]
set_bart_machine_num_cores(8)
bart_machine_cv <- bartMachineCV(train.x, train.y, mem_cache_for_speed = FALSE)##CV
aaa2<-investigate_var_importance(bart_machine_cv,type="splits", plot=FALSE, num_replicates_for_avg=5,num_trees_bottleneck=20)
score2<-aaa2$avg_var_props
write.csv(score2,"BART_ImpMeasure_withAE.csv")

0.5 Miscellaneous

0.5.0.1 BMI with and without Riluzole Treatment(Figure 4)

#obtain subjects with BMI obs and retain those with >=2 obs(as we want to see the trend)
df_bmi<-df[df$feature_name=="BMI",]
df_bmi$feature_value<-as.numeric(df_bmi$feature_value)
df_bmi$feature_delta<-as.numeric(df_bmi$feature_delta)
sID<-as.data.frame(table(df_bmi$SubjectID))
ID<-as.numeric(as.character(sID$Var1[sID$Freq>1]))
df_bmi_1<-df_bmi[df_bmi$SubjectID %in% ID, ]

#obatin subjectID information(with or without Riluzole treatment)
riluzole_yes <- df[df$feature_name=="if_use_Riluzole" & df$feature_value == "Yes", ]
riluzole_no <- df[df$feature_name=="if_use_Riluzole" & df$feature_value == "No", ]

#keep the study subjects with both BMI(n>1) and Riluzole treatment information
bmi_riluzole <- df_bmi_1[df_bmi_1$SubjectID %in% riluzole_yes$SubjectID, ] #Number of Patients=252, obs=737
bmi_no_riluzole <- df_bmi_1[df_bmi_1$SubjectID %in% riluzole_no$SubjectID, ] #Number of Patients=45, obs=311

#Wilcoxon-Mann-Whitney test
bmi_mean_riluzole <- ddply(bmi_riluzole,~SubjectID,summarise,mean=mean(feature_value))
bmi_mean_noriluzole <- ddply(bmi_no_riluzole,~SubjectID,summarise,mean=mean(feature_value))

t.result <- wilcox.test(bmi_mean_riluzole$mean,bmi_mean_noriluzole$mean)#W = 3027, p-value = 6.373e-07

png("BMI with Riluzole.png", width=6000, height= 4000, res = 300)
g1 <- ggplot(bmi_riluzole[bmi_riluzole$feature_delta<300,], aes(feature_delta, feature_value*10000)) + geom_point()
g1 <- g1 + geom_smooth(method = "loess")
g1 <- g1 + xlab("Days") + ylab("BMI") + ggtitle("BMI for Patients with Riluzole Treatment")
g1 <- g1 + theme(plot.title = element_text(size=45, face = "bold"), axis.text.x = element_text(size=16),axis.text.y =element_text(size=16),axis.title=element_text(size=30, face = "bold"))
g1 <- g1 + scale_x_continuous(limits = c(0, 250)) + scale_y_continuous(limits = c(15, 45))
print(g1)
dev.off()
## quartz_off_screen 
##                 2
png("BMI without Riluzole.png", width=6000, height= 4000, res = 300)
g2 <- ggplot(bmi_no_riluzole, aes(feature_delta, feature_value*10000)) + geom_point()
g2 <- g2 + geom_smooth(method = "loess")
g2 <- g2 + xlab("Days") + ylab("BMI") + ggtitle("BMI for Patients without Riluzole Treatment")
g2 <- g2 + theme(plot.title = element_text(size=45, face = "bold"), axis.text.x = element_text(size=16),axis.text.y =element_text(size=16),axis.title=element_text(size=30, face = "bold"))
g2 <- g2 + scale_x_continuous(limits = c(0, 250)) + scale_y_continuous(limits = c(15, 45))
print(g2)
dev.off()
## quartz_off_screen 
##                 2

#### Density plots of 33 features comparing patients with AEs and patients without AEs(Table S2, The results are not shown in this markdown)####

## [1] 5
## [1] "Albumin_min"

## [1] 8
## [1] "ALSFRS_Total_max"

## [1] 11
## [1] "ALSFRS_Total_slope"

## [1] 12
## [1] "ALT.SGPT._max"

## [1] 13
## [1] "ALT.SGPT._median"

## [1] 17
## [1] "AST.SGOT._median"

## [1] 20
## [1] "Bicarbonate_max"

## [1] 21
## [1] "Bicarbonate_median"

## [1] 22
## [1] "Bicarbonate_min"

## [1] 23
## [1] "Bicarbonate_slope"

## [1] 33
## [1] "bp_systolic_median"

## [1] 34
## [1] "bp_systolic_min"

## [1] 35
## [1] "bp_systolic_slope"

## [1] 38
## [1] "Calcium_min"

## [1] 39
## [1] "Calcium_slope"

## [1] 40
## [1] "Chloride_max"

## [1] 49
## [1] "Glucose_max"

## [1] 51
## [1] "Glucose_min"

## [1] 52
## [1] "Glucose_slope"

## [1] 59
## [1] "Hematocrit_min"

## [1] 70
## [1] "mouth_median"

## [1] 71
## [1] "mouth_min"

## [1] 72
## [1] "mouth_slope"

## [1] 73
## [1] "onset_delta_mean"

## [1] 75
## [1] "Platelets_max"

## [1] 77
## [1] "Platelets_min"

## [1] 84
## [1] "pulse_min"

## [1] 85
## [1] "pulse_slope"

## [1] 90
## [1] "Sodium_max"

## [1] 91
## [1] "Sodium_median"

## [1] 92
## [1] "Sodium_min"

## [1] 95
## [1] "trunk_median"

## [1] 96
## [1] "trunk_min"